Notes:
Notes:
# Plotting library used throughout this lesson.
library(ggplot2)
# Load the tab-separated pseudo-Facebook user data; read.delim() defaults to
# sep = '\t'.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
pf <- read.delim('E:\\Study\\coursera\\Udacity\\ud651DataAnalysisWithR\\pseudo_facebook.tsv')
# Quick scatterplot of friend count against age, with named x/y arguments ...
qplot(x = age, y = friend_count, data = pf)
# ... and the same plot relying on qplot()'s positional x and y.
qplot(age, friend_count, data = pf)
Response:
Notes:
# Same scatterplot written with the full ggplot() syntax.
ggplot(pf, aes(age, friend_count)) +
  geom_point()
# Check the range of ages before choosing x-axis limits.
summary(pf$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
# Limit the x axis to ages 13 through 90.
ggplot(pf, aes(age, friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
Notes:
# Reduce overplotting: each point is drawn at 1/20 opacity.
ggplot(pf, aes(age, friend_count)) +
  geom_point(alpha = 1/20) +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
# Same plot with jittered points.
ggplot(pf, aes(age, friend_count)) +
  geom_jitter(alpha = 1/20) +
  xlim(13, 90)
## Warning: Removed 5178 rows containing missing values (geom_point).
Response:
Notes:
# Square-root transform of the y axis applied through the coordinate system.
ggplot(pf, aes(age, friend_count)) +
  geom_point(alpha = 1/20) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')
## Warning: Removed 4906 rows containing missing values (geom_point).
# Jitter with height = 0, i.e. no vertical jitter, combined with the sqrt
# y transform.
ggplot(pf, aes(age, friend_count)) +
  geom_point(alpha = 1/20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')
## Warning: Removed 5182 rows containing missing values (geom_point).
Notes:
# Friendships initiated against age, semi-transparent points.
ggplot(pf, aes(age, friendships_initiated)) +
  geom_point(alpha = 1/20) +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
# Default jitter.
ggplot(pf, aes(age, friendships_initiated)) +
  geom_point(alpha = 1/20, position = 'jitter') +
  xlim(13, 90)
## Warning: Removed 5162 rows containing missing values (geom_point).
# Jitter with height = 0, i.e. no vertical jitter.
ggplot(pf, aes(age, friendships_initiated)) +
  geom_point(alpha = 1/20, position = position_jitter(height = 0)) +
  xlim(13, 90)
## Warning: Removed 5178 rows containing missing values (geom_point).
# Same jitter setting, now with a square-root y axis.
ggplot(pf, aes(age, friendships_initiated)) +
  geom_point(alpha = 1/20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')
## Warning: Removed 5178 rows containing missing values (geom_point).
Notes:
Notes:
#install.packages('dplyr')
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# dplyr verbs relevant to this lesson:
#filter()
#group_by()
#mutate()
#arrange()
# Step-by-step version: group users by age, then compute the mean and median
# friend count and the number of users for each age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- summarise(
  age_groups,
  friend_count_mean = mean(friend_count),
  friend_count_median = median(friend_count),
  n = n()
)
# Order the summary rows by age and preview the first few.
pf.fc_by_age <- arrange(pf.fc_by_age, age)
head(pf.fc_by_age)
## Source: local data frame [6 x 4]
##
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
# Same per-age summary expressed as a single pipeline.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
head(pf.fc_by_age)
## Source: local data frame [6 x 4]
##
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
Create your plot!
# Mean friend count per age, first as points ...
ggplot(pf.fc_by_age, aes(age, friend_count_mean)) +
  geom_point()
# ... then as a connected line.
ggplot(pf.fc_by_age, aes(age, friend_count_mean)) +
  geom_line()
Notes:
# Build up, step by step, a scatterplot of friendships initiated by age with
# summary lines overlaid. Each plot repeats the previous one and adds a layer.
# fun.args is given as a list so that `probs` is passed to quantile() by name
# and no stray `probs` variable is created in the calling environment.

# Step 1: raw data in orange, no vertical jitter, sqrt y axis.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1/20, position = position_jitter(height = 0),
             color = 'orange') +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')
## Warning: Removed 5186 rows containing missing values (geom_point).
# Step 2: add the per-age mean.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1/20, position = position_jitter(height = 0),
             color = 'orange') +
  xlim(13, 90) +
  coord_trans(y = 'sqrt') +
  geom_line(stat = 'summary', fun.y = mean)
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5188 rows containing missing values (geom_point).
# Step 3: add the 10th percentile (dashed blue).
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1/20, position = position_jitter(height = 0),
             color = 'orange') +
  xlim(13, 90) +
  coord_trans(y = 'sqrt') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1),
            linetype = 2, color = 'blue')
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5177 rows containing missing values (geom_point).
# Step 4: add the 90th percentile (dashed blue).
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1/20, position = position_jitter(height = 0),
             color = 'orange') +
  xlim(13, 90) +
  coord_trans(y = 'sqrt') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9),
            linetype = 2, color = 'blue')
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5168 rows containing missing values (geom_point).
# Step 5: add the median (solid blue).
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1/20, position = position_jitter(height = 0),
             color = 'orange') +
  xlim(13, 90) +
  coord_trans(y = 'sqrt') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = median, color = 'blue')
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5189 rows containing missing values (geom_point).
# Step 6: zoom to ages 13-40.
# NOTE(review): a plot holds a single coordinate system, so the
# coord_cartesian() added last replaces the earlier coord_trans() and the
# sqrt transform is not applied here.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1/20, position = position_jitter(height = 0),
             color = 'orange') +
  xlim(13, 90) +
  coord_trans(y = 'sqrt') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = median, color = 'blue') +
  coord_cartesian(xlim = c(13, 40))
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5167 rows containing missing values (geom_point).
# Step 7: final version - no xlim() layer and no sqrt transform; the view is
# set with coord_cartesian() to ages 13-70 and counts 0-1000.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1/20, position = position_jitter(height = 0),
             color = 'orange') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = median, color = 'blue') +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000))
Response:
See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.
Notes:
Notes:
# Pearson correlation between age and friend count, addressing the columns
# directly with `$`.
cor.test(x = pf$age, y = pf$friend_count, method = 'pearson')
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
# Same test, using with() so the columns can be named without the pf$ prefix.
with(pf, cor.test(x = age, y = friend_count, method = 'pearson'))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
Look up the documentation for the cor.test function.
What’s the correlation between age and friend count? Round to three decimal places. Response:
Notes:
# Repeat the correlation test on users aged 70 or younger.
# With no method given, cor.test() computes the Pearson correlation.
with(
  subset(pf, age <= 70),
  cor.test(x = age, y = friend_count)
)
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1780220 -0.1654129
## sample estimates:
## cor
## -0.1717245
# Same subset with the Pearson method requested explicitly.
with(
  subset(pf, age <= 70),
  cor.test(x = age, y = friend_count, method = 'pearson')
)
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1780220 -0.1654129
## sample estimates:
## cor
## -0.1717245
# Same subset with the Spearman rank correlation.
with(
  subset(pf, age <= 70),
  cor.test(x = age, y = friend_count, method = 'spearman')
)
## Warning in cor.test.default(age, friend_count, method = "spearman"): Cannot
## compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: age and friend_count
## S = 1.5782e+14, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.2552934
Notes:
#ggplot(data = pf, aes(x = age, y = friendships_initiated)) + geom_point(alpha = 1/20, position = position_jitter(h = 0), color = 'orange') + geom_line(stat = 'summary', fun.y = mean) + geom_line(stat = 'summary', fun.y = quantile, fun.args = (probs = .1), linetype = 2, color = 'blue' ) + geom_line(stat = 'summary', fun.y = quantile, fun.args = (probs = .9), linetype = 2, color = 'blue' ) + geom_line(stat = 'summary', fun.y = median, color = 'blue' ) + coord_cartesian(xlim = c(13,70), ylim = c(0,1000))
# Likes received overall against likes received via the web.
ggplot(pf, aes(www_likes_received, likes_received)) +
  geom_point()
# Limit both axes to the range from 0 to the 95th percentile and overlay a
# linear-model fit in red.
ggplot(pf, aes(www_likes_received, likes_received)) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = 'lm', color = 'red')
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).
Notes:
# Correlation test (default method) between web likes and total likes
# received, using every row of pf.
with(
  pf,
  cor.test(x = www_likes_received, y = likes_received)
)
##
## Pearson's product-moment correlation
##
## data: www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.
Response:
Notes:
Notes:
#install.packages('alr3')
# alr3 is the package from which the Mitchell data set is loaded below.
library(alr3)
## Warning: package 'alr3' was built under R version 3.3.1
## Loading required package: car
## Warning: package 'car' was built under R version 3.3.1
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Load the data set into the workspace and open its help page.
data(Mitchell)
help("Mitchell")
## starting httpd help server ...
## done
Create your plot!
# Temperature against month for the Mitchell data.
ggplot(Mitchell, aes(Month, Temp)) +
  geom_point()
Take a guess for the correlation coefficient for the scatterplot.
What is the actual correlation of the two variables? (Round to the thousandths place)
# Correlation test (default method) between temperature and month.
cor.test(x = Mitchell$Temp, y = Mitchell$Month)
##
## Pearson's product-moment correlation
##
## data: Mitchell$Temp and Mitchell$Month
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
Notes:
# Put an x-axis break every 12 months: 12, 24, ..., 204.
ggplot(Mitchell, aes(Month, Temp)) +
  geom_point() +
  scale_x_continuous(breaks = 12 * (1:17))
# Alternative break sequence starting at 0 and stepping by 12 up to 203.
ggplot(Mitchell, aes(Month, Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, by = 12))
What do you notice? Response:
Watch the solution video and check out the Instructor Notes! Notes:
Notes:
#ggplot(data = pf, aes(x = age, y = friendships_initiated)) + geom_point(alpha = 1/20, position = position_jitter(h = 0), color = 'orange') + geom_line(stat = 'summary', fun.y = mean) + geom_line(stat = 'summary', fun.y = quantile, fun.args = (probs = .1), linetype = 2, color = 'blue' ) + geom_line(stat = 'summary', fun.y = quantile, fun.args = (probs = .9), linetype = 2, color = 'blue' ) + geom_line(stat = 'summary', fun.y = median, color = 'blue' ) + coord_cartesian(xlim = c(13,70), ylim = c(0,1000))
# Create a new variable, 'age_with_months', in the 'pf' data frame.
# Be sure to save the variable in the data frame rather than creating
# a separate, stand-alone variable. You will need to use the variables
# 'age' and 'dob_month' to create the variable 'age_with_months'.
# Assume the reference date for calculating age is December 31, 2013.
# Age in fractional years: whole years plus (12 - dob_month) / 12, stored as
# a column of pf.
pf$age_with_months <- pf$age + (12 - pf$dob_month) / 12
# Friendships initiated against the finer-grained age, with mean, median and
# 10th/90th percentile lines. fun.args is a list so that `probs` reaches
# quantile() by name and no stray `probs` variable is created.
ggplot(pf, aes(x = age_with_months, y = friendships_initiated)) +
  geom_point(alpha = 1/20, position = position_jitter(height = 0),
             color = 'orange') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = median, color = 'blue')
# Mean/median friend count and group size for every age-with-months value.
age_with_month_groups <- group_by(pf, age_with_months)
pf.fc_by_age_months <- summarise(
  age_with_month_groups,
  friend_count_mean = mean(friend_count),
  friend_count_median = median(friend_count),
  n = n()
)
# Sort explicitly by the grouping key; arrange() without a column does not
# reorder anything.
pf.fc_by_age_months <- arrange(pf.fc_by_age_months, age_with_months)
# Mean friend count by age in months: points over the full range, then a line
# zoomed to ages 13-70.
ggplot(pf.fc_by_age_months, aes(x = age_with_months, y = friend_count_mean)) +
  geom_point()
ggplot(pf.fc_by_age_months, aes(x = age_with_months, y = friend_count_mean)) +
  geom_line() +
  coord_cartesian(xlim = c(13, 70))
Programming Assignment
Notes:
# Compare mean friend count for ages under 71 at three levels of age
# resolution. Each plot is built once and reused, instead of being redefined
# before every grid.arrange() call.

# p1: age measured in months; p2: age measured in whole years.
p1 <- ggplot(subset(pf.fc_by_age_months, age_with_months < 71),
             aes(x = age_with_months, y = friend_count_mean)) +
  geom_line()
p2 <- ggplot(subset(pf.fc_by_age, age < 71),
             aes(x = age, y = friend_count_mean)) +
  geom_line()
# Show each plot on its own first.
p1
p2
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Yearly ages on top, monthly ages below.
grid.arrange(p2, p1, ncol = 1)
# p3: ages rounded to the nearest multiple of 5, mean computed by the
# summary stat.
p3 <- ggplot(subset(pf, age < 71),
             aes(x = round(age / 5) * 5, y = friend_count)) +
  geom_line(stat = 'summary', fun.y = mean)
grid.arrange(p1, p2, p3, ncol = 1)
# Add a smoother to the monthly and yearly plots and compare again.
p1 <- p1 + geom_smooth()
p2 <- p2 + geom_smooth()
grid.arrange(p1, p2, p3, ncol = 1)
Notes:
Reflection:
Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!